map <-
  read_csv("data/HydroWASTE_v10.csv")
## Rows: 58502 Columns: 25
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (5): WWTP_NAME, COUNTRY, CNTRY_ISO, STATUS, LEVEL
## dbl (20): WASTE_ID, SOURCE, ORG_ID, LAT_WWTP, LON_WWTP, QUAL_LOC, LAT_OUT, L...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(map) |>
  knitr::kable()
WASTE_ID SOURCE ORG_ID WWTP_NAME COUNTRY CNTRY_ISO LAT_WWTP LON_WWTP QUAL_LOC LAT_OUT LON_OUT STATUS POP_SERVED QUAL_POP WASTE_DIS QUAL_WASTE LEVEL QUAL_LEVEL DF HYRIV_ID RIVER_DIS COAST_10KM COAST_50KM DESIGN_CAP QUAL_CAP
1 1 1140441 Akmenes aglomeracija Lithuania LTU 56.247 22.726 2 56.223 22.627 Not Reported 1060 2 148.213 4 Advanced 1 2421.974 20228874 4.153 0 0 4600 2
2 1 1140443 Alytaus m aglomeracija Lithuania LTU 54.432 24.056 2 54.519 24.098 Not Reported 87900 2 8797.904 1 Advanced 1 2534.527 20261585 257.983 0 0 220000 2
3 1 1140445 Anyksciu aglomeracija Lithuania LTU 55.509 25.073 2 55.452 25.006 Not Reported 12400 2 1959.285 1 Advanced 1 1367.809 20243105 30.995 0 0 33000 2
4 1 1140447 Ariogalos aglomeracija Lithuania LTU 55.252 23.484 2 55.210 23.510 Not Reported 2500 2 578.482 1 Secondary 1 2061.969 20247446 13.799 0 0 4357 2
5 1 1140449 Baisogalos aglomeracija Lithuania LTU 55.644 23.741 2 55.681 23.835 Not Reported 1200 2 167.788 4 Secondary 1 209.549 20239330 0.405 0 0 1490 2
6 1 1140451 Birstono Prienu aglomeracija Lithuania LTU 54.623 24.062 2 54.715 24.094 Not Reported 12400 2 2239.471 1 Advanced 1 10366.240 20256987 268.665 0 0 19000 2
wwtp <-
  read_csv("data/SARS-CoV-2_concentrations_measured_in_NYC_Wastewater_20231129.csv")
## Rows: 4270 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): Sample Date, Test date, WRRF Name, WRRF Abbreviation, Annotation, T...
## num (3): Concentration SARS-CoV-2 gene target (N1 Copies/L), Per capita SARS...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(wwtp) |>
  knitr::kable()
Sample Date Test date WRRF Name WRRF Abbreviation Concentration SARS-CoV-2 gene target (N1 Copies/L) Per capita SARS-CoV-2 load (N1 copies per day per population) Annotation Population Served, estimated Technology
08/31/2020 09/01/2020 26th Ward 26W 389 263535.6 Concentration below Method Limit of Quantification (above Method Limit of Detection) 290608 RT-qPCR
08/31/2020 09/01/2020 Bowery Bay BB 1204 443632.9 NA 924695 RT-qPCR
08/31/2020 09/01/2020 Coney Island CI 304 168551.6 Concentration below Method Limit of Quantification (above Method Limit of Detection) 682342 RT-qPCR
08/31/2020 09/01/2020 Hunts Point HP 940 574446.6 NA 755948 RT-qPCR
08/31/2020 09/01/2020 Jamaica Bay JA 632 233077.7 NA 748737 RT-qPCR
08/31/2020 09/01/2020 Newtown Creek NC 197 122396.8 Concentration below Method Limit of Quantification (above Method Limit of Detection) 1156473 RT-qPCR
map_new <-
  map |>
  janitor::clean_names() |>
  select(waste_id, wwtp_name, lat_wwtp, lon_wwtp) |>
  mutate(wrrf_abbreviation = recode( wwtp_name,
    "New York C   Rockaway WPCP" = "RK",
    "New York C   Red Hook WPCP" = "RH",
    "New York C   Port Richmond WPCP" = "PR",
    "New York C   Oakwood Beach WPCP" = "OB",
    "New York C   26th Ward WPCP" = "26W",
    "New York C   Tallman Island WPCP" = "TI",
    "New York C   North River WPCP" = "NR",
    "New York C   Coney Island WPCP" = "CI",
    "New York C   Jamaica WPCP" = "JA",
    "New York C   Hunts Point WPCP" = "HP",
    "New York C   Owls Head WPCP" = "OH",
    "New York C   Bowery Bay WPCP" = "BB",
    "New York C   Newtown Creek WPCP" = "NC",
    "New York C   Wards Island WPCP" = "WI",
  ))
wwtp_cleaned <-
  wwtp |>
  janitor:: clean_names()
merge_df <- inner_join(map_new, wwtp_cleaned, by = "wrrf_abbreviation")
nyc_wwtp <-
  merge_df |>
  rename(concentration = concentration_sars_co_v_2_gene_target_n1_copies_l) |>
  drop_na(concentration) |>
  separate(sample_date, into = c("month", "day", "year"), convert = TRUE) %>% 
   mutate(
     year = as.character(year),
     month = factor(month, levels = 1:12),
     month = recode(month,
                        "1" = "January",
                        "2" = "February",
                        "3" = "March",
                        "4" = "April",
                        "5" = "May",
                        "6" = "June",
                        "7" = "July",
                        "8" = "August",
                        "9" = "September",
                        "10" = "October",
                        "11" = "November",
                        "12" = "December")) |>
  select(-waste_id, -test_date, -per_capita_sars_co_v_2_load_n1_copies_per_day_per_population, -population_served_estimated, -wwtp_name)

nyc_wwtp
## # A tibble: 4,129 × 10
##    lat_wwtp lon_wwtp wrrf_abbreviation month   day year  wrrf_name concentration
##       <dbl>    <dbl> <chr>             <fct> <int> <chr> <chr>             <dbl>
##  1     40.8    -73.9 WI                Augu…    31 2020  Wards Is…          1339
##  2     40.8    -73.9 WI                Sept…     8 2020  Wards Is…          1065
##  3     40.8    -73.9 WI                Sept…    13 2020  Wards Is…           394
##  4     40.8    -73.9 WI                Sept…    15 2020  Wards Is…           316
##  5     40.8    -73.9 WI                Sept…    20 2020  Wards Is…          1250
##  6     40.8    -73.9 WI                Octo…     6 2020  Wards Is…          1152
##  7     40.8    -73.9 WI                Octo…    11 2020  Wards Is…           600
##  8     40.8    -73.9 WI                Octo…    13 2020  Wards Is…          2066
##  9     40.8    -73.9 WI                Octo…    18 2020  Wards Is…           444
## 10     40.8    -73.9 WI                Octo…    20 2020  Wards Is…          3026
## # ℹ 4,119 more rows
## # ℹ 2 more variables: annotation <chr>, technology <chr>

Creating the dataset for an overall trend for Covid-19 concentration across the 14 waste water facilities within New York State from 2020 to 2023.

overall_trend <-
  nyc_wwtp |>
  filter(technology == "RT-qPCR") |>
  select(-lat_wwtp, -lon_wwtp, -annotation) |>
  group_by(year, month) |>
  summarise(avg_conc = mean(concentration)) |>
  unite("year_month",year, month, sep = "_") |>
  mutate(Month = ymd(paste0(year_month, "_01")))
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.

Plotting Scatterplot

overall_trend |>
  ggplot(aes(x = Month, y = avg_conc)) + 
  geom_point(color = c("#FFA500")) +
  geom_smooth(se = FALSE, color = "dark grey") +
    labs(
      x = "Time", 
      y = "Monthly average Covid-19 concentration in waste water in NY (N/L)") + 
  theme(axis.line = element_line(color = "grey"), 
      panel.background = element_blank(), 
      legend.position = "none", 
      panel.grid.major = element_line(color = "light grey", linetype = "dashed"),
      plot.title = element_text(hjust = 0.5)) + 
  ggtitle("Monthly average Covid-19 concentration in waste water in NY measured from 2020 to 2023")
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

Histogram

overall_trend |>
  ggplot(aes(x = year_month, y = avg_conc)) +
  geom_col(position = "dodge") +
  stat_summary(fun = mean, geom = "line", aes(group = 1), color = "orange", size = 1) +
  labs(x = "Year-Month", y = "Average Concentration") +
  theme(axis.text.x = element_text(angle = 60, hjust = 1, color = "black"))

Sub dataset

rtqpcr <-
  nyc_wwtp |>
  filter(year %in% c(2021, 2022), technology == "RT-qPCR") |>
  select(-lat_wwtp, -lon_wwtp, -annotation)

dpcr <-
  nyc_wwtp |>
  filter(year %in% c(2021, 2022), technology == "dPCR") |>
  select(-lat_wwtp, -lon_wwtp, -annotation)

dpcr is not large enough to do data analysis.

data analysis

trend_plot_2021 <-
  rtqpcr |>
  filter(year == 2021) |>
  group_by(month, wrrf_name) |>
  summarise(avg_conc = mean(concentration)) |>
  ggplot(aes(x = month, y = avg_conc, color = wrrf_name, group = wrrf_name)) +
  geom_line(alpha = .5) +
  geom_point(alpha = .5) +
  labs(x = "Month", 
       y = "Average Covid-19 Concentration of each waste water facility", 
       title = "Average Covid-19 Concentration of each waste water facility in 2021 in New York State") + 
  theme(legend.position = "bottom",
        legend.box.background = element_rect())
## `summarise()` has grouped output by 'month'. You can override using the
## `.groups` argument.
trend_plot_2021

trend_plot_2022

trend_plot_2022 <-
  rtqpcr |>
  filter(year == 2022) |>
  group_by(month, wrrf_name) |>
  summarise(avg_conc = mean(concentration)) |>
  ggplot(aes(x = month, y = avg_conc, color = wrrf_name, group = wrrf_name)) +
  geom_line(alpha = .5) +
  geom_point(alpha = .5) +
  labs(x = "Month", 
       y = "Average Covid-19 Concentration of each waste water facility", 
       title = "Average Covid-19 Concentration of each waste water facility in 2022 in New York State") + 
  theme(legend.position = "bottom",
        legend.box.background = element_rect())
## `summarise()` has grouped output by 'month'. You can override using the
## `.groups` argument.
trend_plot_2022

tables 2021

box_2021 <- rtqpcr %>%
  filter(year == 2021) %>%
  group_by(month, wrrf_name) %>%
  summarise(avg_conc = mean(concentration)) %>%
  plot_ly(
    x = ~avg_conc,
    y = ~wrrf_name,
    type = "box",
    color = ~wrrf_name,
    colors = "viridis"
  ) %>%
  layout(
    xaxis = list(title = "Mean concentration (N/L)"),
    yaxis = list(title = "Area"),
    showlegend = FALSE
  )
## `summarise()` has grouped output by 'month'. You can override using the
## `.groups` argument.
box_2021

tables 2022

box_2022 <- rtqpcr %>%
  filter(year == 2022) %>%
  group_by(month, wrrf_name) %>%
  summarise(avg_conc = mean(concentration)) %>%
  plot_ly(
    x = ~avg_conc,
    y = ~wrrf_name,
    type = "box",
    color = ~wrrf_name,
    colors = "viridis"
  ) %>%
  layout(
    xaxis = list(title = "Mean concentration (N/L)"),
    yaxis = list(title = "Area"),
    showlegend = FALSE
  )
## `summarise()` has grouped output by 'month'. You can override using the
## `.groups` argument.
box_2021
nyc_wwtp |>
  filter(year == 2022, technology == "RT-qPCR") |>
  group_by(month, wrrf_abbreviation) |>
  ggplot(aes(x = month, y = concentration, color = wrrf_abbreviation)) +
  geom_point(alpha = .5)

library(leaflet)
wwtp_map <-
  nyc_wwtp |>
  leaflet() |>
  addTiles() |>
  addMarkers(
    ~lon_wwtp, ~lat_wwtp
  )

wwtp_map

14 wwtps map

##Questions 1 The concentration of covid for each station on average 2 literature review foe two diff tech (RT-qPCR, dPCR) 3 Year 2021 and 2022